In [1]:
## Required packages
In [2]:
# Install spotifyr on first run only; requireNamespace() checks availability
# quietly without attaching the package.
spotifyr_available <- requireNamespace("spotifyr", quietly = TRUE)
if (!spotifyr_available) {
  install.packages("spotifyr")
}
In [3]:
library(spotifyr)
In [82]:
# Let's set our Spotify API credentials. For having access to spotifyR functions, the developer needs to connect to Spotify's API. 
# In this file, I hide my Spotify API credentials, as it is not advisable to share them.
# The lines of code where I used spotifyr functions are commented out. The file top200 represents the processed data.
In [83]:
# Sys.setenv(SPOTIFY_CLIENT_ID = 'SPOTIFY_CLIENT_ID')
# Sys.setenv(SPOTIFY_CLIENT_SECRET = 'SPOTIFY_CLIENT_SECRET')

# Authenticate and get access token
# access_token <- get_spotify_access_token()

SPOTIFYR PACKAGE¶

In [6]:
# The spotifyr package allows interaction with Spotify's Web API to retrieve data about artists, albums, tracks, and more. It provides functions like get_artist() and get_track() to access detailed metadata, including names, 
# popularity, genres, and unique Spotify IDs. These IDs can be used for further queries, such as fetching discographies, track features, or playlist details.
In [7]:
# Example - Adele's catalog 
# adele <- get_artist("4dpARuHxo51G3z768sgnrY?si=9ff5ce3557664f64")
# adele_id <- adele$id

# Adele's song - "Someone like you"
# id: 3bNv3VuUOKgrf5hu3YcuRo?si=40e9f3fca7f04587
# song <- get_track("3bNv3VuUOKgrf5hu3YcuRo?si=40e9f3fca7f04587")
# song_id <- song$id

REAL DATA PROCESSING¶

In [8]:
# On Spotify Charts website, we can find top200 songs of the current week. I work on the week of 13-20 December 2024.
# URL: https://charts.spotify.com/charts/view/regional-global-weekly/latest
In [84]:
# top200 <- read.csv("regional-global-weekly-2024-12-19.csv")
In [86]:
# Let's add a Spotify id column - the id is part of each track's Spotify URL

# top200$track_id <- sub("spotify:track:", "", top200$uri)
In [85]:
# Let's add popularity column.
# The popularity score in Spotify ranges from 0 to 100, where 100 represents 
# the most popular tracks at a given moment. It is calculated based on recent 
# streaming activity, total play counts, skipping behavior, playlist inclusions, 
# and a time-decay factor. Recent streams have a greater impact, meaning 
# a song's popularity can fluctuate over time.

# pop_list <- c()
# for(id in top200$track_id){
  # track <- get_track(id)
  # pop_list <- c(pop_list, track$popularity)
# }
# pop_list
# top200$popularity <- pop_list
In [1]:
# Let's drop some columns we don't need

# top200$source <- NULL
# top200$uri <- NULL
In [3]:
# Let's add songs duration in ms

# duration_list <- c()
# for(id in top200$track_id){
  # track <- get_track(id)
  # duration_list <- c(duration_list,track$duration_ms)
#}
# top200$duration <- duration_list
In [79]:
# write.csv(top200, "top200.csv", row.names = FALSE)
In [80]:
# Load the processed data set produced by the commented-out pipeline above
top200 <- read.csv('top200.csv')
In [14]:
# Data set is ready. Let's make sure there are no missing values.
# anyNA() is the idiomatic, short-circuiting equivalent of any(is.na(x)).
anyNA(top200)
FALSE
In [15]:
# Basic information and structure of our data
In [16]:
# Structure overview: column names, types, and first few values
str(top200)
'data.frame':	200 obs. of  10 variables:
 $ rank          : int  1 2 3 4 5 6 7 8 9 10 ...
 $ artist_names  : chr  "Lady Gaga, Bruno Mars" "ROSÉ, Bruno Mars" "Mariah Carey" "Wham!" ...
 $ track_name    : chr  "Die With A Smile" "APT." "All I Want for Christmas Is You" "Last Christmas" ...
 $ peak_rank     : int  1 1 1 2 1 3 4 4 3 3 ...
 $ previous_rank : int  1 2 3 4 5 6 7 9 8 11 ...
 $ weeks_on_chart: int  18 9 64 58 31 50 9 49 4 50 ...
 $ streams       : int  68135080 60486312 51621732 50554131 45417447 44823735 43330956 41055176 35020901 34645959 ...
 $ track_id      : chr  "2plbrEY59IikOBgBGLjaoe" "4wJ5Qq0jBN4ajy7ouZIV1c" "0bYg9bo50gSsH3LtXe2SQn" "2FRnf9qhLbvw8fu4IBXx78" ...
 $ popularity    : int  100 91 86 85 96 84 96 82 88 83 ...
 $ duration      : int  251667 169917 241106 262960 210373 126266 166300 130973 177598 204093 ...
In [17]:
# Numeric five-number summaries (and character-column lengths) per column
summary(top200)
      rank        artist_names        track_name          peak_rank     
 Min.   :  1.00   Length:200         Length:200         Min.   :  1.00  
 1st Qu.: 50.75   Class :character   Class :character   1st Qu.:  5.00  
 Median :100.50   Mode  :character   Mode  :character   Median : 21.00  
 Mean   :100.50                                         Mean   : 34.69  
 3rd Qu.:150.25                                         3rd Qu.: 50.25  
 Max.   :200.00                                         Max.   :163.00  
 previous_rank    weeks_on_chart      streams           track_id        
 Min.   : -1.00   Min.   :  1.00   Min.   : 8918641   Length:200        
 1st Qu.: 31.75   1st Qu.:  9.75   1st Qu.:10413317   Class :character  
 Median : 82.50   Median : 34.00   Median :12675728   Mode  :character  
 Mean   : 83.69   Mean   : 55.34   Mean   :15728417                     
 3rd Qu.:132.25   3rd Qu.: 66.00   3rd Qu.:17352659                     
 Max.   :200.00   Max.   :377.00   Max.   :68135080                     
   popularity        duration     
 Min.   : 48.00   Min.   : 68760  
 1st Qu.: 78.75   1st Qu.:163012  
 Median : 84.50   Median :187567  
 Mean   : 81.95   Mean   :195064  
 3rd Qu.: 87.00   3rd Qu.:228676  
 Max.   :100.00   Max.   :459766  

DATA SCALING¶

In [18]:
# Some variables such as duration or streams, have large magnitude comparing to variables with smaller scale, ex. popularity or weeks on chart.
# That may lead us to false conclusions.
# Let's scale the data.
In [19]:
# Names of the numerical variables used by all downstream
# scaling and clustering steps.
numeric_cols <- c(
  "rank", "peak_rank", "previous_rank", "weeks_on_chart",
  "streams", "popularity", "duration"
)
In [20]:
# Let's check the skewness of each numerical column.
# vapply() is preferred over sapply() here: it guarantees a plain numeric
# vector (one value per column) instead of an input-dependent return type.
library(e1071)
skewness_values <- vapply(top200[, numeric_cols], skewness, numeric(1))
print(skewness_values)
          rank      peak_rank  previous_rank weeks_on_chart        streams 
     0.0000000      1.3302412      0.1299297      2.2065259      2.8568070 
    popularity       duration 
    -1.6125186      0.7152488 
In [21]:
# Many variables are skewed, so we log-transform them before scaling.
# A small shift keeps every argument to log() strictly positive.
log_shift <- function(v, shift) log(v + shift)

peak_rank_log      <- log_shift(top200$peak_rank, 1)
weeks_on_chart_log <- log_shift(top200$weeks_on_chart, 1)
streams_log        <- log_shift(top200$streams, 1)
popularity_log     <- log_shift(top200$popularity, 2)
In [22]:
# Build a numeric-only copy of top200, swapping the skewed columns
# for the log-transformed versions computed above.
numerical_top200 <- top200[, numeric_cols]
numerical_top200$peak_rank      <- peak_rank_log
numerical_top200$weeks_on_chart <- weeks_on_chart_log
numerical_top200$streams        <- streams_log
numerical_top200$popularity     <- popularity_log
In [23]:
# Now, let's scale the numerical_top200 dataset.
# scale() centers each column and divides by its sd; it returns a matrix.
final_scaled_data <- scale(numerical_top200)
In [24]:
# Let's convert the scaled matrix back to a data frame for later use
final_scaled_data <- as.data.frame(final_scaled_data)
In [25]:
# Let's save it as CSV in the working directory
# write.csv(final_scaled_data, "final_scaled_data.csv", row.names = FALSE)

Clustering¶

K -means

In [26]:
library(factoextra)
Warning message:
"pakiet 'factoextra' został zbudowany w wersji R 4.4.2"
Ładowanie wymaganego pakietu: ggplot2

Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

In [27]:
set.seed(123)

# For a given number of clusters k, run k-means (25 random starts) and
# return the mean silhouette width as a cluster-quality score.
#
# k    - number of clusters to fit.
# data - numeric matrix/data frame to cluster; defaults to the globally
#        scaled dataset so existing calls keep working unchanged.
compute_silhouette <- function(k, data = final_scaled_data) {
  km.out <- kmeans(data, centers = k, nstart = 25)
  # Namespaced call: this function is defined before library(cluster) runs
  # later in the script, so don't rely on the package being attached.
  km.silhouette <- cluster::silhouette(km.out$cluster, dist(data))
  mean(km.silhouette[, 3])
}
In [28]:
# Let's compute silhouette scores for k = 2 to 10.
library(cluster)
k_values <- 2:10
# vapply() guarantees exactly one numeric score per k (type-stable vs. sapply)
silhouette_scores <- vapply(k_values, compute_silhouette, numeric(1))
Warning message:
"pakiet 'cluster' został zbudowany w wersji R 4.4.2"
In [29]:
# Let's store the results in a dataframe (one row per candidate k)
silhouette_results <- data.frame(K = k_values, Avg_Silhouette_Score = silhouette_scores)
In [30]:
# Display the average silhouette score for each candidate k
silhouette_results
A data.frame: 9 × 2
KAvg_Silhouette_Score
<int><dbl>
20.2416964
30.2182421
40.2279560
50.2132007
60.2093079
70.2162810
80.2224289
90.2162346
100.2290832
In [74]:
# The highest average silhouette score is for k=2; it is around 0.241. Let's summarize the k-means clustering with k=2.
In [32]:
# Final k-means model with the best k from the silhouette sweep (k = 2);
# fviz_cluster projects the clusters onto the first two principal components
km.out <- kmeans(final_scaled_data, centers = 2, nstart = 25) 
fviz_cluster(list(data = final_scaled_data, cluster = km.out$cluster), 
             geom = "point", 
             ellipse.type = "norm",
             main = "K-means Clustering")
No description has been provided for this image
In [33]:
# The silhouette score is not really satisfying, as it's only around 0.241. This is visible on the graph - the clusters are overlapping. Let's try to find clusters of higher quality.

Hierarchical clustering

In [34]:
# Perform hierarchical clustering using Euclidean distance.
# ward.D2 minimizes within-cluster variance and expects Euclidean distances.
hc <- hclust(dist(final_scaled_data), method = "ward.D2")
In [35]:
# Cut the dendrogram to create 2 clusters
# (comment previously said 3, but k = 2 is what the code uses)
hc_clusters <- cutree(hc, k = 2)

# Plot the dendrogram with the two clusters highlighted
fviz_dend(hc, k = 2, rect = TRUE, show_labels = FALSE)
Warning message:
"The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as of ggplot2 3.3.4.
ℹ The deprecated feature was likely used in the factoextra package.
  Please report the issue at <https://github.com/kassambara/factoextra/issues>."
No description has been provided for this image
In [36]:
# Visualize the clusters (projected onto the first two principal components)
fviz_cluster(list(data = final_scaled_data, cluster = hc_clusters), geom = "point", ellipse.type = "norm", main = "Hierarchical Clustering")
No description has been provided for this image
In [37]:
# Evaluate the quality of the clusters with the average silhouette width
silhouette_hc <- silhouette(hc_clusters, dist(final_scaled_data))
mean(silhouette_hc[,3])
0.24633041967291
In [38]:
# Quality comparable to k-means

DBSCAN

In [39]:
library("fpc")
library("dbscan")
Warning message:
"pakiet 'fpc' został zbudowany w wersji R 4.4.2"
Warning message:
"pakiet 'dbscan' został zbudowany w wersji R 4.4.2"

Dołączanie pakietu: 'dbscan'


Następujący obiekt został zakryty z 'package:fpc':

    dbscan


Następujący obiekt został zakryty z 'package:spotifyr':

    tidy


Następujący obiekt został zakryty z 'package:stats':

    as.dendrogram


In [40]:
# DBSCAN Clustering.
# fpc:: is written explicitly because the dbscan package masks fpc::dbscan
# (see the masking messages when the packages were attached above).
db <- fpc::dbscan(final_scaled_data, eps = 2, MinPts = 3)
fviz_cluster(db, final_scaled_data, geom = "point")
No description has been provided for this image
In [41]:
# The density based clustering algorithm claims there is only one cluster and outliers (cluster 0).
# Let's see if this data is clusterable using the Hopkins statistic.
In [42]:
# NOTE(review): `clustertend` is deprecated (see the warning below); the
# `hopkins` package is the suggested replacement, but it may use the opposite
# convention for H — confirm before swapping packages.
library(clustertend)
# Hopkins statistic computed from n-1 sampled points of the scaled data
hop_stat <- hopkins(final_scaled_data, n=nrow(final_scaled_data)-1)
hop_stat
Package `clustertend` is deprecated.  Use package `hopkins` instead.

Warning message in hopkins(final_scaled_data, n = nrow(final_scaled_data) - 1):
"Package `clustertend` is deprecated.  Use package `hopkins` instead."
$H = 0.240954180523589
In [43]:
# The Hopkins statistic of about 0.24 indicates that the data may not be suitable for clustering. Its distribution may be close to uniform.
# The last trial to uncover some patterns in data would consist of dimension reduction. Let's try PCA dimension reduction technique and then perform clustering on a reduced data space.

PCA Dimension reduction¶

In [44]:
# PCA on the scaled data. set.seed() is not strictly needed (prcomp is
# deterministic), and scale. = TRUE is redundant here because the columns
# are already standardized — both are harmless.
set.seed(42)
pca_result <- prcomp(final_scaled_data, scale. = TRUE)
In [45]:
# Let's see how much variance each component explains (scree plot):
# sdev^2 are the eigenvalues, divided by their sum to get proportions
plot(pca_result$sdev^2 / sum(pca_result$sdev^2), type = "b", 
    xlab = "Principal Component", ylab = "Variance Explained")
No description has been provided for this image
In [46]:
# Let's keep the first 3 principal components (since they explain ~75% variance)
pca_data <- pca_result$x[, 1:3]

K-means on PCA data

In [47]:
set.seed(123)
km_pca <- kmeans(pca_data, centers = 2, nstart = 25)
In [48]:
# Convert PCA data to dataframe and add cluster labels
# (as a factor so plotting treats them as discrete groups)
pca_clusters <- as.data.frame(pca_data)
pca_clusters$cluster <- factor(km_pca$cluster)
In [49]:
# Preview the PCA scores with their k-means cluster assignment
head(pca_clusters)
A data.frame: 6 × 4
PC1PC2PC3cluster
<dbl><dbl><dbl><fct>
1-4.657720 0.7542685 1.59257621
2-4.388381-0.4361702 0.20983541
3-4.032944 0.9905556 0.27536351
4-3.842431 0.9258339 0.62588071
5-3.986567 0.7249615 0.64518871
6-3.594548-0.2540533-1.15345061
In [50]:
library(plotly)

# Interactive 3D scatter of the three retained components, colored by cluster
plot_ly(pca_clusters, x = ~PC1, y = ~PC2, z = ~PC3, color = ~cluster, colors = "Dark2",
        type = "scatter3d", mode = "markers", marker = list(size = 5, opacity = 0.8)) %>%
  layout(title = "3D Visualization of K-Means Clustering on PCA Data")
Warning message:
"pakiet 'plotly' został zbudowany w wersji R 4.4.2"

Dołączanie pakietu: 'plotly'


Następujący obiekt został zakryty z 'package:ggplot2':

    last_plot


Następujący obiekt został zakryty z 'package:stats':

    filter


Następujący obiekt został zakryty z 'package:graphics':

    layout


In [51]:
# Average silhouette width of k-means on the PCA-reduced data
silhouette_score <- silhouette(km_pca$cluster, dist(pca_data))
mean(silhouette_score[, 3])
0.324631722499112
In [52]:
# After reducing dimensionality with PCA, the cluster quality (measured by silhouette score) improved. It is about 0.32.

Hierarchical clustering

In [53]:
# Hierarchical (Ward) clustering on the PCA scores;
# set.seed() is not needed here — hclust is deterministic
set.seed(42)
hclust_res <- hclust(dist(pca_data), method = "ward.D2")
In [54]:
# Cut the tree into 2 clusters to match the other methods
hclust_clusters <- cutree(hclust_res, k = 2)
In [55]:
# Add the hierarchical labels and plot them in the same 3D PCA space
pca_clusters$hclust_cluster <- factor(hclust_clusters)

plot_ly(pca_clusters, x = ~PC1, y = ~PC2, z = ~PC3, color = ~hclust_cluster, colors = "Dark2",
        type = "scatter3d", mode = "markers", marker = list(size = 5, opacity = 0.8)) %>%
  layout(title = "Hierarchical Clustering on PCA-Reduced Data")
In [56]:
# Average silhouette width of hierarchical clustering on the PCA data
sil_hclust <- silhouette(hclust_clusters, dist(pca_data))
cat("Hierarchical Clustering Silhouette Score:", mean(sil_hclust[, 3]), "\n")
Hierarchical Clustering Silhouette Score: 0.2682708 
In [57]:
# Let's try one more - PAM (Partitioning Around Medoids)
library(cluster)
set.seed(123)
pam_pca <- pam(pca_data, k = 2)

pca_clusters$cluster_pam <- factor(pam_pca$clustering)

# Bug fix: color by the PAM labels (~cluster_pam). The original colored by
# ~cluster — the k-means labels — so the plot did not show the PAM partition
# its title claims.
plot_ly(pca_clusters, x = ~PC1, y = ~PC2, z = ~PC3, color = ~cluster_pam, colors = "Dark2",
        type = "scatter3d", mode = "markers", marker = list(size = 5, opacity = 0.8)) %>%
  layout(title = "3D Visualization of PAM Clustering on PCA Data")

# Average silhouette width of PAM on the PCA data
silhouette_score <- silhouette(pam_pca$clustering, dist(pca_data))
mean(silhouette_score[, 3])
0.332452275441615
In [58]:
# Silhouette slightly better than k-means (0.33 compared to 0.32)

Conclusion¶

In [59]:
# PAM clustering after PCA dimension reduction has the highest quality.
In [60]:
# Preview all three cluster assignments (k-means, hclust, PAM) side by side
head(pca_clusters)
A data.frame: 6 × 6
PC1PC2PC3clusterhclust_clustercluster_pam
<dbl><dbl><dbl><fct><fct><fct>
1-4.657720 0.7542685 1.5925762111
2-4.388381-0.4361702 0.2098354111
3-4.032944 0.9905556 0.2753635111
4-3.842431 0.9258339 0.6258807111
5-3.986567 0.7249615 0.6451887111
6-3.594548-0.2540533-1.1534506111
In [61]:
# Attach the winning (PAM) labels to the original, unscaled data for profiling
top200$cluster <- pca_clusters$cluster_pam
head(top200)
A data.frame: 6 × 11
rankartist_namestrack_namepeak_rankprevious_rankweeks_on_chartstreamstrack_idpopularitydurationcluster
<int><chr><chr><int><int><int><int><chr><int><int><fct>
11Lady Gaga, Bruno MarsDie With A Smile 1118681350802plbrEY59IikOBgBGLjaoe1002516671
22ROSÉ, Bruno Mars APT. 12 9604863124wJ5Qq0jBN4ajy7ouZIV1c 911699171
33Mariah Carey All I Want for Christmas Is You 1364516217320bYg9bo50gSsH3LtXe2SQn 862411061
44Wham! Last Christmas 2458505541312FRnf9qhLbvw8fu4IBXx78 852629601
55Billie Eilish BIRDS OF A FEATHER 1531454174476dOtVTDdiauQNBQEDOtlAB 962103731
66Brenda Lee Rockin' Around The Christmas Tree3650448237352EjXfH91m7f8HiJN1yQg97 841262661
In [62]:
# Cluster sizes: how many songs fell into each PAM cluster
table(top200$cluster)
  1   2 
 59 141 
In [63]:
# Cluster 1
In [64]:
# Subset the songs assigned to PAM cluster 1
cl1 <- top200[top200$cluster == 1,]
In [65]:
# Profile cluster 1 on the original (unscaled) variables
summary(cl1)
      rank      artist_names        track_name          peak_rank    
 Min.   : 1.0   Length:59          Length:59          Min.   : 1.00  
 1st Qu.:15.5   Class :character   Class :character   1st Qu.: 3.00  
 Median :30.0   Mode  :character   Mode  :character   Median : 6.00  
 Mean   :31.0                                         Mean   :10.25  
 3rd Qu.:45.5                                         3rd Qu.:15.00  
 Max.   :71.0                                         Max.   :42.00  
 previous_rank   weeks_on_chart      streams           track_id        
 Min.   : 1.00   Min.   :  2.00   Min.   :14817404   Length:59         
 1st Qu.:15.50   1st Qu.:  9.00   1st Qu.:17666381   Class :character  
 Median :30.00   Median : 36.00   Median :21790676   Mode  :character  
 Mean   :32.54   Mean   : 35.25   Mean   :25671738                     
 3rd Qu.:45.50   3rd Qu.: 46.50   3rd Qu.:28846120                     
 Max.   :87.00   Max.   :128.00   Max.   :68135080                     
   popularity        duration      cluster
 Min.   : 48.00   Min.   :117146   1:59   
 1st Qu.: 82.50   1st Qu.:166100   2: 0   
 Median : 88.00   Median :190973          
 Mean   : 84.85   Mean   :194266          
 3rd Qu.: 90.00   3rd Qu.:219557          
 Max.   :100.00   Max.   :284066          
In [66]:
# Cluster 2
In [67]:
# Subset the songs assigned to PAM cluster 2
cl2 <- top200[top200$cluster == 2,]
In [68]:
# Profile cluster 2 on the original (unscaled) variables
summary(cl2)
      rank       artist_names        track_name          peak_rank     
 Min.   : 44.0   Length:141         Length:141         Min.   :  1.00  
 1st Qu.: 95.0   Class :character   Class :character   1st Qu.: 12.00  
 Median :130.0   Mode  :character   Mode  :character   Median : 33.00  
 Mean   :129.6                                         Mean   : 44.91  
 3rd Qu.:165.0                                         3rd Qu.: 70.00  
 Max.   :200.0                                         Max.   :163.00  
 previous_rank   weeks_on_chart      streams           track_id        
 Min.   : -1.0   Min.   :  1.00   Min.   : 8918641   Length:141        
 1st Qu.: 73.0   1st Qu.: 10.00   1st Qu.:10016597   Class :character  
 Median :112.0   Median : 32.00   Median :11137041   Mode  :character  
 Mean   :105.1   Mean   : 63.74   Mean   :11567736                     
 3rd Qu.:147.0   3rd Qu.: 95.00   3rd Qu.:12829538                     
 Max.   :200.0   Max.   :377.00   Max.   :17962947                     
   popularity       duration      cluster
 Min.   :49.00   Min.   : 68760   1:  0  
 1st Qu.:76.00   1st Qu.:162767   2:141  
 Median :84.00   Median :187520          
 Mean   :80.74   Mean   :195398          
 3rd Qu.:86.00   3rd Qu.:232186          
 Max.   :89.00   Max.   :459766          

Interpretation¶

In [75]:
# It was challenging to find distinguishing groups in the top200 songs. All the songs in the ranking are very similar - the distribution is close to uniform. 
# However, let’s see the characteristics of the two clusters that may give us a hint about which two types of songs appear on the top200 ranking.
In [70]:
# Cluster 1
In [71]:
# It contains 59 observations—the highest-ranking songs.
# The mean rank is 31, ranging from 1st to 71st place.
# The songs in this cluster are relatively new, with an average presence of 35 weeks on the chart.
# The freshest hit has been on the ranking for just 2 weeks, while the longest-lasting song in this group has remained for 128 weeks.
# The average number of streams is 25M, but this value is influenced by outliers — the median is 21M.
# The average popularity score is 84/100, indicating that this cluster consists of recent global hits.
In [72]:
# Cluster 2
In [4]:
# It contains 141 songs. The mean rank is 129, which is significantly lower than in Cluster 1.
# In terms of weeks on the chart, the average is 63 weeks — twice as long as in Cluster 1.
# The average number of streams is 11M, considerably lower than in Cluster 1.
# The popularity score is similar (80 vs. 84), suggesting that popularity and streams alone do not determine chart position. The devotion of fans of niche songs also matters.
# The Top 200 list is dominated by Cluster 2 songs, which have reached some level of maturity (measured in weeks) and continue to be consistently played by fans.